// argument process function set. voidhelp_arg(){ printf("Usage: ./csim-ref [-hv] -s <s> -E <E> -b <b> -t <tracefile>\n"); printf("-h: Optional help flag that prints usage info\n"); printf("-v: Optional verbose flag that displays trace info\n"); printf("-s <s>: Number of set index bits\n"); printf("-E <E>: Associativity (number of lines per set)\n"); printf("-b <b>: Number of block bits\n"); printf("-t <tracefile>: Name of the valgrind trace to replay\n"); return; }
// cache update and init function set. _cache_set_p_ init_cache(){ /* get memory for sets and cache_line * s is Number of set index bits. so S = 2^s is the number of sets. * E is Associativity (number of lines per set) * b is Number of block bits */ int32_t tmp_S = 1 << arg_unit.s; _cache_set_p_ _ret_cache_sets = (_cache_set_p_)malloc(sizeof(_cache_set_) * tmp_S); for (int32_t i = 0; i < tmp_S; ++i){ _ret_cache_sets[i].cache_lines = (_cache_line_ *)malloc(sizeof(_cache_line_) * arg_unit.E); for (int32_t j = 0; j < arg_unit.E; ++j){ _ret_cache_sets[i].cache_lines[j].valid = 0; _ret_cache_sets[i].cache_lines[j].tag = -1; _ret_cache_sets[i].cache_lines[j].stamp = -1; } } return _ret_cache_sets; }
voidcache_update(uint32_t address, _cache_set_p_ cache_sets){ // update cache, use address. // b is the Number of block bits int32_t set_index_address = (address >> arg_unit.b) & ((-1U) >> (64 - arg_unit.s)); int32_t tag_address = address >> (arg_unit.b + arg_unit.s); // hit. for (int32_t i = 0; i < arg_unit.E; ++i){ if (cache_sets[set_index_address].cache_lines[i].tag == tag_address){ cache_sets[set_index_address].cache_lines[i].stamp = 0; cnt_unit.hit_cnt ++; return; } } // miss but have blank line. for (int32_t i = 0; i < arg_unit.E; ++i){ if (cache_sets[set_index_address].cache_lines[i].valid == 0){ cache_sets[set_index_address].cache_lines[i].valid = 1; cache_sets[set_index_address].cache_lines[i].tag = tag_address; cache_sets[set_index_address].cache_lines[i].stamp = 0; cnt_unit.miss_cnt ++; return; } } // miss and need to be replaced. cnt_unit.miss_cnt ++; cnt_unit.eviction_cnt ++; int32_t max_stamp = INT_MIN; int32_t max_stamp_idx = -1; for (int32_t i = 0; i < arg_unit.E; ++i){ if (cache_sets[set_index_address].cache_lines[i].stamp > max_stamp){ max_stamp = cache_sets[set_index_address].cache_lines[i].stamp; max_stamp_idx = i; } } cache_sets[set_index_address].cache_lines[max_stamp_idx].tag = tag_address; cache_sets[set_index_address].cache_lines[max_stamp_idx].stamp = 0; return; } voidcache_update_stamp(_cache_set_p_ cache_sets){ int32_t tmp_S = 1 << arg_unit.s; for (int32_t i = 0; i < tmp_S; ++i){ for (int32_t j = 0; j < arg_unit.E; ++j){ cache_sets[i].cache_lines[j].stamp ++; } } return; }
voidsimulate_cache(){ /* Start simulate. * Read data from file(t). * The format in file is <op + address + size> * "I 0400d7d4,8" * " M 0421c7f0,4" * " L 04f6b868,8" * " S 7ff0005c8,8" * ignore I instruction. */ int32_t tmp_S = 1 << arg_unit.s; FILE* fp = fopen(arg_unit.t, "r"); if(fp == NULL){ printf("open error"); exit(-1); } _cache_set_p_ cache_sets = init_cache(); char op; uint32_t address; int32_t size; while(fscanf(fp, " %c %xu,%d\n", &op, &address, &size) > 0){ switch (op){ case'L': cache_update(address, cache_sets); break; case'M': cache_update(address, cache_sets); case'S': cache_update(address, cache_sets); break; } cache_update_stamp(cache_sets); } fclose(fp); for (int32_t i = 0; i < tmp_S; ++i){ free(cache_sets[i].cache_lines); } free(cache_sets); }
2 Optimizing matrix transpose
首先测试的前提是 cache 32-set,each set has only one cache line,and each cache line has 32 bytes. 最多使用 12 个变量
2.1 32 x 32
先使用最原始的方法:
1 2 3 4 5 6 7 8 9
if (M == 32){ int tmp; for (int i = 0; i < N; i++) { for (int j = 0; j < M; j++) { tmp = A[i][j]; B[j][i] = tmp; } } }
Summary for official submission (func 0): correctness=1 misses=1183
TEST_TRANS_RESULTS=1:1183
可以看到 miss 数达到了惊人的 1183 个。
8x8 的分块
对于 32 x 32 的矩阵非常容易想到可以使用 8 x 8 的分块。因为 cache line 一行能够存储的一共就是 8 Byte,在 32 x 32 的矩阵中,一行有 32 个 int, 一共就是需要 4 个 cache line 可以,那么所有 cache 可以装下矩阵的 8 行。所以这里使用 8 x 8 分块。
1 2 3 4 5 6 7
if (M == 32){ for (int i = 0; i < N; i += 8) for (int j = 0; j < M; j += 8) for (int m = i; m < i + 8; ++m) for (int n = j; n < j + 8; ++n) B[n][m] = A[m][n]; }